import numpy as np
import pandas as pd
import plotly.express as px
import wrds
pd.options.display.max_columns=200
conn=wrds.Connection()
WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully. Loading library list... Done
auditop_vars= conn.describe_table(library='audit', table='auditopin')['name']
print(len(auditop_vars))
Approximately 343544 rows in audit.auditopin. 225
vars_to_skip= ['matchfy', 'matchqu', 'closestfy', 'closestqu', 'priorqu', 'priorfy', 'note', 'opinion_text', 'eventdate']
for idx, var in enumerate(auditop_vars):
if any(var_to_skip in var for var_to_skip in vars_to_skip):
pass
else:
print(idx, var)
0 audit_op_key 1 auditor_affil_fkey 2 auditor_affilname 3 auditor_fkey 4 auditor_name 5 sig_date_of_op_x 6 sig_date_of_op_s 7 going_concern 8 auditor_city 9 auditor_state 10 auditor_state_name 11 auditor_country 12 auditor_region 13 auditor_con_sup_reg 14 fiscal_year_of_op 15 fiscal_year_end_op 16 op_aud_pcaob 17 pcaob_reg_num 23 accnt_basis 24 ftp_file_fkey 25 form_fkey 26 file_date 27 file_accepted 28 file_size 29 http_name_html 30 http_name_text 31 company_fkey 32 best_edgar_ticker
df= conn.raw_sql(f"""select
audit_op_key,
auditor_fkey as afk,
auditor_name as afn,
going_concern as gco,
auditor_country as af_ctry,
fiscal_year_of_op,
form_fkey,
company_fkey as c,
best_edgar_ticker as t
from audit.auditopin
""")
print(df.shape)
(343544, 9)
vars_to_keep= ['audit_op_key', 'auditor_fkey', 'auditor_name', 'going_concern', 'auditor_country',
'fiscal_year_of_op', 'form_fkey', 'company_fkey', 'best_edgar_ticker']
df_test= conn.get_table(library= 'audit', table='auditopin',
columns= vars_to_keep,
obs= 3)
df_test
| audit_op_key | auditor_fkey | auditor_name | going_concern | auditor_country | fiscal_year_of_op | form_fkey | company_fkey | best_edgar_ticker | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 66900.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2000.0 | 10-K | 0000000020 | None |
| 1 | 78252.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2001.0 | 10-K405 | 0000000020 | None |
| 2 | 3538.0 | 4.0 | KPMG LLP | 0.0 | USA | 2002.0 | 10-K | 0000000020 | None |
df= conn.get_table(library= 'audit', table='auditopin',
columns= vars_to_keep)
print(df.shape)
df.head()
(343544, 9)
| audit_op_key | auditor_fkey | auditor_name | going_concern | auditor_country | fiscal_year_of_op | form_fkey | company_fkey | best_edgar_ticker | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 66900.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2000.0 | 10-K | 0000000020 | None |
| 1 | 78252.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2001.0 | 10-K405 | 0000000020 | None |
| 2 | 3538.0 | 4.0 | KPMG LLP | 0.0 | USA | 2002.0 | 10-K | 0000000020 | None |
| 3 | 39225.0 | 6.0 | Grant Thornton LLP | 0.0 | USA | 2003.0 | 10-K | 0000000020 | None |
| 4 | 84748.0 | 6.0 | Grant Thornton LLP | 0.0 | USA | 2004.0 | 10-K | 0000000020 | None |
df.rename(columns=
{'auditor_fkey':'afk',
'auditor_name':'afn',
'going_concern':'gco',
'auditor_country':'af_ctry',
'fiscal_year_of_op':'fy',
'company_fkey':'cik',
'best_edgar_ticker':'tic'},
inplace= True)
df.groupby(['afn','afk']).count().sort_values('cik', ascending=False).head(20)
| audit_op_key | gco | af_ctry | fy | form_fkey | cik | tic | ||
|---|---|---|---|---|---|---|---|---|
| afn | afk | |||||||
| Deloitte & Touche LLP | 3.0 | 56507 | 56507 | 56326 | 56507 | 56507 | 56507 | 11497 |
| Ernst & Young LLP | 2.0 | 50808 | 50808 | 50627 | 50808 | 50808 | 50808 | 16659 |
| PricewaterhouseCoopers LLP | 1.0 | 50565 | 50565 | 50247 | 50565 | 50565 | 50565 | 13439 |
| KPMG LLP | 4.0 | 36703 | 36703 | 36570 | 36703 | 36703 | 36703 | 11808 |
| Grant Thornton LLP | 6.0 | 31512 | 31512 | 31506 | 31512 | 31511 | 31512 | 3188 |
| Arthur Andersen LLP | 5.0 | 6398 | 6398 | 6397 | 6398 | 6398 | 6398 | 901 |
| BDO USA LLP | 11761.0 | 3137 | 3137 | 3132 | 3137 | 3137 | 3137 | 1939 |
| BDO Seidman LLP | 7.0 | 2964 | 2964 | 2944 | 2964 | 2964 | 2964 | 675 |
| MaloneBailey LLP | 151.0 | 2552 | 2552 | 2550 | 2552 | 2552 | 2552 | 1044 |
| Marcum LLP | 8256.0 | 2250 | 2250 | 2248 | 2250 | 2250 | 2250 | 1628 |
| Tait Weller & Baker LLP | 142.0 | 1949 | 1949 | 1949 | 1949 | 1949 | 1949 | 40 |
| McGladrey & Pullen LLP | 10.0 | 1550 | 1550 | 1546 | 1550 | 1550 | 1550 | 467 |
| Moss Adams LLP | 13.0 | 1399 | 1399 | 1399 | 1399 | 1399 | 1399 | 647 |
| RSM US LLP | 26516.0 | 1139 | 1139 | 1139 | 1139 | 1139 | 1139 | 726 |
| BKD LLP | 11.0 | 1058 | 1058 | 1053 | 1058 | 1058 | 1058 | 412 |
| Crowe Horwath LLP (Inactive) | 2830.0 | 1038 | 1038 | 1038 | 1038 | 1038 | 1038 | 501 |
| M&K CPAS PLLC | 3143.0 | 1002 | 1002 | 1001 | 1002 | 1002 | 1002 | 472 |
| WithumSmith + Brown PC | 52.0 | 964 | 964 | 964 | 964 | 964 | 964 | 681 |
| BBD LLP | 11308.0 | 913 | 913 | 913 | 913 | 913 | 913 | 10 |
| Crowe Chizek & Company LLP | 8.0 | 892 | 892 | 890 | 892 | 892 | 892 | 214 |
df=df[df.fy>=2000]
df=df[df.fy<=2020]
df['big5']= (df['afk']<= 5).astype(int)
print(df.shape)
df.head()
(324416, 10)
| audit_op_key | afk | afn | gco | af_ctry | fy | form_fkey | cik | tic | big5 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 66900.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2000.0 | 10-K | 0000000020 | None | 1 |
| 1 | 78252.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2001.0 | 10-K405 | 0000000020 | None | 1 |
| 2 | 3538.0 | 4.0 | KPMG LLP | 0.0 | USA | 2002.0 | 10-K | 0000000020 | None | 1 |
| 3 | 39225.0 | 6.0 | Grant Thornton LLP | 0.0 | USA | 2003.0 | 10-K | 0000000020 | None | 0 |
| 4 | 84748.0 | 6.0 | Grant Thornton LLP | 0.0 | USA | 2004.0 | 10-K | 0000000020 | None | 0 |
df[df.gco==1].groupby(['fy']).count()['cik'].plot(kind='bar', figsize=(10,6))
<AxesSubplot:xlabel='fy'>
df.head()
| audit_op_key | afk | afn | gco | af_ctry | fy | form_fkey | cik | tic | big5 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 66900.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2000.0 | 10-K | 0000000020 | None | 1 |
| 1 | 78252.0 | 5.0 | Arthur Andersen LLP | 0.0 | USA | 2001.0 | 10-K405 | 0000000020 | None | 1 |
| 2 | 3538.0 | 4.0 | KPMG LLP | 0.0 | USA | 2002.0 | 10-K | 0000000020 | None | 1 |
| 3 | 39225.0 | 6.0 | Grant Thornton LLP | 0.0 | USA | 2003.0 | 10-K | 0000000020 | None | 0 |
| 4 | 84748.0 | 6.0 | Grant Thornton LLP | 0.0 | USA | 2004.0 | 10-K | 0000000020 | None | 0 |
df_b6= df[df.afk<=6]
df_b6=df_b6[['afn', 'fy']]
df_b6.head()
| afn | fy | |
|---|---|---|
| 0 | Arthur Andersen LLP | 2000.0 |
| 1 | Arthur Andersen LLP | 2001.0 |
| 2 | KPMG LLP | 2002.0 |
| 3 | Grant Thornton LLP | 2003.0 |
| 4 | Grant Thornton LLP | 2004.0 |
df_b6.head(2)
| afn | fy | |
|---|---|---|
| 0 | Arthur Andersen LLP | 2000.0 |
| 1 | Arthur Andersen LLP | 2001.0 |
df_cross= pd.crosstab(df_b6.fy, df_b6.afn).reset_index()
df_cross.head()
| afn | fy | Arthur Andersen LLP | Deloitte & Touche LLP | Ernst & Young LLP | Grant Thornton LLP | KPMG LLP | PricewaterhouseCoopers LLP |
|---|---|---|---|---|---|---|---|
| 0 | 2000.0 | 2549 | 2884 | 3306 | 981 | 1998 | 3098 |
| 1 | 2001.0 | 2357 | 2724 | 2662 | 987 | 2154 | 2973 |
| 2 | 2002.0 | 196 | 3177 | 2863 | 955 | 2284 | 2875 |
| 3 | 2003.0 | 0 | 3232 | 3079 | 926 | 2295 | 3300 |
| 4 | 2004.0 | 0 | 2698 | 2785 | 1164 | 2175 | 2714 |
df_cross_melted= pd.melt(df_cross, id_vars='fy')
df_cross_melted.rename(columns={'value':'count'}, inplace= True)
df_cross_melted.head()
| fy | afn | count | |
|---|---|---|---|
| 0 | 2000.0 | Arthur Andersen LLP | 2549 |
| 1 | 2001.0 | Arthur Andersen LLP | 2357 |
| 2 | 2002.0 | Arthur Andersen LLP | 196 |
| 3 | 2003.0 | Arthur Andersen LLP | 0 |
| 4 | 2004.0 | Arthur Andersen LLP | 0 |
df_cross_melted['bubble_size']= df_cross_melted['count']*2+100
df_cross_melted.head()
| fy | afn | count | bubble_size | |
|---|---|---|---|---|
| 0 | 2000.0 | Arthur Andersen LLP | 2549 | 5198 |
| 1 | 2001.0 | Arthur Andersen LLP | 2357 | 4814 |
| 2 | 2002.0 | Arthur Andersen LLP | 196 | 492 |
| 3 | 2003.0 | Arthur Andersen LLP | 0 | 100 |
| 4 | 2004.0 | Arthur Andersen LLP | 0 | 100 |
fig=px.scatter(df_cross_melted, x='fy', y='count',
hover_name='afn',
animation_frame='fy',
animation_group='afn',
color='afn',
size='bubble_size',
size_max=30,
range_x= [2000,2020],
range_y=[-100,3500],
height=600,
labels=dict(fy='Fiscal Year', count='# of Clients', afn='Audit Firm Name'),
title="The Number of Clients for Big 6 Audit Firms"
)
fig.show()
fig.write_html("auditor-dist.html")
df_dist= pd.crosstab(df_b6.fy, df_b6.afn)
df_dist.head()
| afn | Arthur Andersen LLP | Deloitte & Touche LLP | Ernst & Young LLP | Grant Thornton LLP | KPMG LLP | PricewaterhouseCoopers LLP |
|---|---|---|---|---|---|---|
| fy | ||||||
| 2000.0 | 2549 | 2884 | 3306 | 981 | 1998 | 3098 |
| 2001.0 | 2357 | 2724 | 2662 | 987 | 2154 | 2973 |
| 2002.0 | 196 | 3177 | 2863 | 955 | 2284 | 2875 |
| 2003.0 | 0 | 3232 | 3079 | 926 | 2295 | 3300 |
| 2004.0 | 0 | 2698 | 2785 | 1164 | 2175 | 2714 |
fig=px.scatter(df_dist)
fig.show()
df_dist= df_dist.reset_index()
df_dist.head()
| afn | fy | Arthur Andersen LLP | Deloitte & Touche LLP | Ernst & Young LLP | Grant Thornton LLP | KPMG LLP | PricewaterhouseCoopers LLP |
|---|---|---|---|---|---|---|---|
| 0 | 2000.0 | 2549 | 2884 | 3306 | 981 | 1998 | 3098 |
| 1 | 2001.0 | 2357 | 2724 | 2662 | 987 | 2154 | 2973 |
| 2 | 2002.0 | 196 | 3177 | 2863 | 955 | 2284 | 2875 |
| 3 | 2003.0 | 0 | 3232 | 3079 | 926 | 2295 | 3300 |
| 4 | 2004.0 | 0 | 2698 | 2785 | 1164 | 2175 | 2714 |
df_melted= pd.melt(df_dist, id_vars=['fy'])
df_melted.head()
| fy | afn | value | |
|---|---|---|---|
| 0 | 2000.0 | Arthur Andersen LLP | 2549 |
| 1 | 2001.0 | Arthur Andersen LLP | 2357 |
| 2 | 2002.0 | Arthur Andersen LLP | 196 |
| 3 | 2003.0 | Arthur Andersen LLP | 0 |
| 4 | 2004.0 | Arthur Andersen LLP | 0 |
fig=px.scatter(df_melted, x='fy', y='value',
hover_name='afn',
color='afn',
size='value',
range_x= [2000,2020],
range_y=[-100,4000],
height=600)
fig.show()